In [4]:
import pickle
import numpy as np                   # used for arrays throughout
import matplotlib.pyplot as plt      # used for the plots below

with open("./data/preprocessing", 'rb') as fileObject:   # pickle files must be opened in binary mode
    df_tablet = pickle.load(fileObject)

In [5]:
df_X = df_tablet[[
        'word', 'sentence', 'flesch reading', 'word/sentence', 'syllable',
        'display', 'sound', 'os', 'security', 'hardware', 'battery', 'bug',
        'price', 'cs', 'wifi', 'accesory', 'app', 'compatible', 'depth',
        'depth/word', 'usable', 'total topic', 'redundancy',
        'redundancy/sentence', 'rank', 'pos topic', 'neg topic', 'density'
    ]]
df_y = df_tablet[['help class']]

In [6]:
from sklearn.model_selection import train_test_split

X = df_X.values                  # feature matrix
y = np.ravel(df_y.values)        # flatten the target into a 1-D array
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=33)

RandomForest Parameter Tuning

=================================================================================================================

  • Find optimal parameters through grid search and compare them with those obtained by the greedy method.
  • Among the many scoring types (recall, accuracy, precision, F1 score, etc.), this project focuses on recall-related scores.
  • This is because I believe a high true positive rate (TPR) is the key to the problem defined in this project (see the short sketch below).
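
  • As a minimal illustration of that equivalence (hypothetical toy labels, not the project data), recall of the positive class is TP / (TP + FN), which is exactly the true positive rate:

In [ ]:
# minimal sketch on hypothetical toy labels: recall of the positive class == TPR
import numpy as np
from sklearn.metrics import recall_score, confusion_matrix

y_true_toy = np.array([1, 1, 1, 0, 0, 1, 0, 1])   # hypothetical ground truth
y_pred_toy = np.array([1, 0, 1, 0, 1, 1, 0, 1])   # hypothetical predictions

tn, fp, fn, tp = confusion_matrix(y_true_toy, y_pred_toy).ravel()
print('recall_score :', recall_score(y_true_toy, y_pred_toy))   # 0.8
print('TP/(TP+FN)   :', tp / (tp + fn))                         # 0.8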

In [9]:
# Baseline check: any useful model has to perform at least as well as this dummy classifier
from sklearn.dummy import DummyClassifier 
clf_Dummy = DummyClassifier(strategy='most_frequent')
clf_Dummy = clf_Dummy.fit(X_train, y_train)
print('baseline score =>', round(clf_Dummy.score(X_test, y_test), 2))


baseline score => 0.51

===================================================================================================================

1. Find optimal parameters

1) Greedy Method (sequential parameter tuning)

  • The good point of sequential parameter tuning is that I can see how each parameter affects performance.
  • ...however, I found it not very useful for finding the optimal combination of parameters.
  • ...so using this method after grid search would be fast and convenient.

(1) n_estimators


In [506]:
from sklearn.metrics import recall_score
from sklearn.ensemble import RandomForestClassifier
from matplotlib.pyplot import axvline, axhline

recall_range = []
n_estimator_range = []

# sweep n_estimators and record the recall of the 'helpful' class on the test set
for i in np.arange(10, 20, 1):
    clf_RF = RandomForestClassifier(oob_score=True, n_estimators=i).fit(X_train, y_train)
    clf_RF_predicted = clf_RF.predict(X_test)
    recall = round(recall_score(y_test, clf_RF_predicted), 2)

    n_estimator_range.append(i)
    recall_range.append(recall)

# map each n_estimators value to its recall so the best point can be marked on the plot
dictionary = dict(zip(n_estimator_range, recall_range))
    
plt.figure(figsize=(10, 3))
plt.plot(n_estimator_range, recall_range, color='#EA5959', label='max recall: %(n)0.2f \n%(s)s: %(v)2d' %  
         {'n':max(dictionary.values()), 's':'n estimator', 'v':max(dictionary, key=lambda i: dictionary[i])})
plt.scatter([max(dictionary, key=lambda i: dictionary[i]), ], [max(dictionary.values()), ], 80, color='#EA5959')
axhline(max(dictionary.values()), color='#EA5959', linewidth=1, linestyle='--')
axvline(max(dictionary, key=lambda i: dictionary[i]), color='#EA5959', linewidth=1, linestyle='--')
plt.legend(loc='lower right', prop={'size':12}) 
plt.xlim(min(n_estimator_range), max(n_estimator_range))
plt.ylim(min(recall_range)*0.98, max(recall_range)*1.02)
plt.ylabel('Recall')
plt.xlabel('n estimator');


(2) max features


In [511]:
recall_range = []
max_features_range = []

# sweep max_features with n_estimators fixed at 18 (chosen in the previous step)
for i in np.arange(1, 15, 1):
    clf_RF = RandomForestClassifier(oob_score=True, n_estimators=18, max_features=i).fit(X_train, y_train)
    clf_RF_predicted = clf_RF.predict(X_test)
    recall = round(recall_score(y_test, clf_RF_predicted), 2)

    max_features_range.append(i)
    recall_range.append(recall)

dictionary = dict(zip(max_features_range, recall_range))
    
plt.figure(figsize=(10, 3))
plt.plot(max_features_range, recall_range, color='#EA5959', label='max recall: %(n)0.2f \n%(s)s: %(v)2d' %  
         {'n':max(dictionary.values()), 's':'max features', 'v':max(dictionary, key=lambda i: dictionary[i])})
plt.scatter([max(dictionary, key=lambda i: dictionary[i]), ], [max(dictionary.values()), ], 80, color='#EA5959')
axhline(max(dictionary.values()), color='#EA5959', linewidth=1, linestyle='--')
axvline(max(dictionary, key=lambda i: dictionary[i]), color='#EA5959', linewidth=1, linestyle='--')
plt.legend(loc='lower right', prop={'size':12}) 
plt.xlim(min(max_features_range), max(max_features_range))
plt.ylim(min(recall_range)*0.98, max(recall_range)*1.02)
plt.ylabel('Recall')
plt.xlabel('max features');


(3) min samples leaf


In [513]:
recall_range = []
min_samples_leaf_range = []

# sweep min_samples_leaf with n_estimators and max_features fixed at the values chosen above
for i in np.arange(1, 20, 1):
    clf_RF = RandomForestClassifier(oob_score=True, n_estimators=18, max_features=14, min_samples_leaf=i).fit(X_train, y_train)
    clf_RF_predicted = clf_RF.predict(X_test)
    recall = round(recall_score(y_test, clf_RF_predicted), 2)

    min_samples_leaf_range.append(i)
    recall_range.append(recall)

dictionary = dict(zip(min_samples_leaf_range, recall_range))
    
plt.figure(figsize=(10, 3))
plt.plot(min_samples_leaf_range, recall_range, color='#EA5959', label='max recall: %(n)0.2f \n%(s)s: %(v)2d' %  
         {'n':max(dictionary.values()), 's':'min samples leaf', 'v':max(dictionary, key=lambda i: dictionary[i])})
plt.scatter([max(dictionary, key=lambda i: dictionary[i]), ], [max(dictionary.values()), ], 80, color='#EA5959')
axhline(max(dictionary.values()), color='#EA5959', linewidth=1, linestyle='--')
axvline(max(dictionary, key=lambda i: dictionary[i]), color='#EA5959', linewidth=1, linestyle='--')
plt.legend(loc='lower right', prop={'size':12}) 
plt.xlim(min(min_samples_leaf_range), max(min_samples_leaf_range))
plt.ylim(min(recall_range)*0.98, max(recall_range)*1.02)
plt.ylabel('Recall')
plt.xlabel('min samples leaf');


2) Grid Search

  • Grid search is a very efficient way to find optimal parameter values.
  • Assigning an 'appropriate' value range seems essential in grid search. For instance, I set the value range for 'max_features' to 10 to 28, since I wanted the model to be robust, even at the expense of some performance.

In [32]:
from sklearn.pipeline import Pipeline
pipeline_clf_train = Pipeline(
    steps=[
        ('clf_RF', RandomForestClassifier()),
    ]
);

In [34]:
from sklearn.model_selection import GridSearchCV
parameters = {
    'clf_RF__min_samples_leaf' : np.arange(1, 28, 1),
    'clf_RF__max_features' : np.arange(10, 28, 1),
    'clf_RF__criterion' : ['gini', 'entropy'],
    'clf_RF__n_estimators' : [10],
    #'clf_RF__oob_score' : [True]
}
gs_clf = GridSearchCV(pipeline_clf_train, parameters, n_jobs=-1, scoring='recall')
gs_clf = gs_clf.fit(X_train, y_train)

In [35]:
best_parameters = gs_clf.best_params_
for param_name in sorted(parameters.keys()):
    print("%s: %r" % (param_name, best_parameters[param_name]))
print('------------------------------')
print('recall score :', round(gs_clf.best_score_, 2))


clf_RF__criterion: 'entropy'
clf_RF__max_features: 19
clf_RF__min_samples_leaf: 27
clf_RF__n_estimators: 10
------------------------------
recall score : 0.77

2. Take a look at the confusion matrix (what is a confusion matrix?)

1) Greedy Method

  • The focal point of the confusion matrix in this project is the recall of the 'helpful' class.
  • With the parameters obtained from the greedy method, I got a recall of 0.68, meaning that out of 129 'helpful' reviews, 88 were predicted correctly (see the short sketch below).
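
  • As a quick sketch, the 0.68 figure can be read directly off the confusion matrix printed further below (values copied from that output):

In [ ]:
# reading 'helpful' recall off the confusion matrix (rows = true labels, columns = predicted labels)
import numpy as np

cm_greedy = np.array([[69, 54],    # true 'not helpful': 69 correct, 54 misclassified
                      [41, 88]])   # true 'helpful'    : 41 misclassified, 88 correct
print('helpful recall :', round(cm_greedy[1, 1] / cm_greedy[1].sum(), 2))   # 0.68 = 88/129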

In [531]:
# random forest with the parameters found by the greedy method
clf_RF = RandomForestClassifier(n_estimators=18, max_features=14, min_samples_leaf=9, oob_score=True).fit(X_train, y_train)
clf_RF_predicted = clf_RF.predict(X_test)

In [532]:
from sklearn.metrics import classification_report, confusion_matrix
target_names = ['not helpful', 'helpful']
print(classification_report(y_test, clf_RF_predicted, target_names=target_names))


             precision    recall  f1-score   support

not helpful       0.63      0.56      0.59       123
    helpful       0.62      0.68      0.65       129

avg / total       0.62      0.62      0.62       252


In [533]:
plt.figure(figsize=(4,4))
cm = confusion_matrix(y_test, clf_RF_predicted)
print(cm)
target_names = ['not helpful', 'helpful']
plt.grid(False)
plt.imshow(cm, interpolation='nearest', cmap=plt.cm.Blues)
plt.title('Confusion matrix')
plt.colorbar()
tick_marks = np.arange(len(target_names))
plt.xticks(tick_marks, target_names, rotation=45)
plt.yticks(tick_marks, target_names)
plt.tight_layout()
plt.ylabel('True label')
plt.xlabel('Predicted label');


[[69 54]
 [41 88]]

2) Grid Search

  • Compared to the above, performance with the grid-search parameters is better: a recall of 0.75, i.e. 96 out of 129 'helpful' reviews predicted correctly.

In [ ]:
# random forest with the parameters found by grid search
clf_RF = RandomForestClassifier(n_estimators=10, max_features=19, min_samples_leaf=27, criterion='entropy', oob_score=True).fit(X_train, y_train)
clf_RF_predicted = clf_RF.predict(X_test)

In [338]:
from sklearn.metrics import classification_report, confusion_matrix
target_names = ['not helpful', 'helpful']
print(classification_report(y_test, clf_RF_predicted, target_names=target_names))


             precision    recall  f1-score   support

not helpful       0.67      0.52      0.58       123
    helpful       0.62      0.75      0.68       129

avg / total       0.64      0.64      0.63       252


In [359]:
plt.figure(figsize=(4, 4))
cm = confusion_matrix(y_test, clf_RF_predicted)
print(cm)
target_names = ['not helpful', 'helpful']
plt.grid(False)
plt.imshow(cm, interpolation='nearest', cmap=plt.cm.Blues)
plt.title('Confusion matrix')
plt.colorbar()
tick_marks = np.arange(len(target_names))
plt.xticks(tick_marks, target_names, rotation=45)
plt.yticks(tick_marks, target_names)
plt.tight_layout()
plt.ylabel('True label')
plt.xlabel('Predicted label');


[[64 59]
 [33 96]]

3. ROC Curve (what is ROC/AUC?)

  • An Area Under the Curve (AUC) above 0.7 is generally considered useful classification performance.
    • (Streiner and Cairney, 2007, "What's under the ROC? An introduction to receiver operating characteristic curves.")
  • The AUC obtained with grid search (0.68) is greater than that of the greedy method (0.66); a quick cross-check sketch follows below.
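
  • As a cross-check sketch (assuming X_test, y_test, and whichever clf_RF was fitted most recently in the cells above), roc_auc_score on the 'helpful'-class probabilities should agree with the auc(fpr, tpr) values computed for the plots below:

In [ ]:
# AUC cross-check: score = predicted probability of the 'helpful' class (column 1)
from sklearn.metrics import roc_auc_score

proba_helpful = clf_RF.predict_proba(X_test)[:, 1]
print('AUC :', round(roc_auc_score(y_test, proba_helpful), 2))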

1) Greedy Method


In [540]:
clf_RF = RandomForestClassifier(n_estimators=18, max_features=14, min_samples_leaf=9, oob_score=True).fit(X_train, y_train)
clf_RF_predicted = clf_RF.predict(X_test)

In [541]:
from sklearn.metrics import roc_curve, auc
# use the predicted probability of the 'helpful' class (column 1) as the ROC score for both models
fpr_rf, tpr_rf, thresholds_rf = roc_curve(y_test, clf_RF.predict_proba(X_test)[:, 1])
fpr_base, tpr_base, thresholds_base = roc_curve(y_test, clf_Dummy.predict_proba(X_test)[:, 1])

plt.figure(figsize=(5, 5))
plt.plot(fpr_rf, tpr_rf, color='#E45A84', linewidth=3, linestyle='-', 
         label = 'random forest: %(performance)0.2f' % {'performance':auc(fpr_rf, tpr_rf)})
plt.plot(fpr_base, tpr_base, color='#FFACAC', linewidth=2, linestyle='--', 
         label = 'baseline: %(performance)0.2f' % {'performance':auc(fpr_base, tpr_base)})
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate (Fall-Out)')
plt.ylabel('True Positive Rate (Recall)')
plt.title('ROC (Receiver operating characteristic)', fontdict={'fontsize': 12})
plt.legend(loc="lower right");



In [318]:
clf_RF = RandomForestClassifier(n_estimators=10, max_features=19, min_samples_leaf=27, criterion='entropy', oob_score=True).fit(X_train, y_train)
clf_RF_predicted = clf_RF.predict(X_test)

In [317]:
# again, use the predicted probability of the 'helpful' class (column 1) as the ROC score for both models
fpr_rf, tpr_rf, thresholds_rf = roc_curve(y_test, clf_RF.predict_proba(X_test)[:, 1])
fpr_base, tpr_base, thresholds_base = roc_curve(y_test, clf_Dummy.predict_proba(X_test)[:, 1])

plt.figure(figsize=(5, 5))
plt.plot(fpr_rf, tpr_rf, color='#E45A84', linewidth=3, linestyle='-', 
         label = 'random forest: %(performance)0.2f' % {'performance':auc(fpr_rf, tpr_rf)})
plt.plot(fpr_base, tpr_base, color='#FFACAC', linewidth=2, linestyle='--', 
         label = 'baseline: %(performance)0.2f' % {'performance':auc(fpr_base, tpr_base)})
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate (Fall-Out)')
plt.ylabel('True Positive Rate (Recall)')
plt.title('ROC (Receiver operating characteristic)', fontdict={'fontsize': 12})
plt.legend(loc="lower right");


4. So what?

  • In this analysis, I tried to find optimal parameters for the model with the greedy method and with grid search.
  • After checking several metrics, the parameters from grid search turned out to outperform those from the greedy method (a re-computation sketch follows the table below).
        Greedy Method   Grid Search
  TPR        0.68           0.75
  AUC        0.66           0.68
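
  • As a closing sketch (assuming X_train, y_train, X_test, y_test from the cells above), both rows of the table can be re-computed in one pass; exact values may wobble slightly because no random_state is fixed for the forests:

In [ ]:
# re-compute helpful-class recall (TPR) and AUC for both parameter sets
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import recall_score, roc_auc_score

param_sets = {
    'Greedy Method': dict(n_estimators=18, max_features=14, min_samples_leaf=9),
    'Grid Search':   dict(n_estimators=10, max_features=19, min_samples_leaf=27,
                          criterion='entropy'),
}

for name, params in param_sets.items():
    clf = RandomForestClassifier(oob_score=True, **params).fit(X_train, y_train)
    tpr = recall_score(y_test, clf.predict(X_test))
    auc_value = roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1])
    print('%-13s  TPR: %.2f  AUC: %.2f' % (name, tpr, auc_value))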